rm(list = ls())

here::i_am(file.path("code", "01_iv_main.R"))
library(here)
source(here("code", "config.R"))

#################################################################################
# Load cards, census, analysis data

est_df <- read_parquet(here("data", "analysis", "analysis.parquet")) |>
  mutate(
    county_geoid_c1930 = replace_na(county_geoid_c1930, -999)
  )

census <- read_parquet(here("data", "analysis", "census.parquet"))
cards <- read_parquet(here("data", "analysis", "cards.parquet"))

#################################################################################
# Binscatter order number and order number implied by serial number

order_bs <- feols(local_order_num ~ local_order_num_from_serial, data = cards, vcov = "hetero")
slope <- paste0("Slope: ", formatC(order_bs$coefficients[2], digits = 2), " (", formatC(order_bs$se[2], digits = 2), ")")

p <- cards |>
  ggplot(aes(x = local_order_num_from_serial, y = local_order_num)) +
  stat_binscatter(bins = 15) +
  my_theme() +
  labs(
    x = "Order number inferred from serial number",
    y = "Order number on card",
    caption = slope
  ) +
  xlim(0, 4000) +
  ylim(0, 4000)

p_color <- p + geom_smooth(linewidth = .5, color = "#F8766D", method = "lm", se = FALSE)
ggsave(file.path(fig_dir, "color", "local_order_num_binscatter.pdf"), plot = p_color, width = 5, height = 3)

p_bw <- p + geom_smooth(linewidth = .5, color = "black", method = "lm", se = FALSE)
ggsave(file.path(fig_dir, "bw", "local_order_num_binscatter.pdf"), plot = p_bw, width = 5, height = 3)

#################################################################################
# First stage binscatter, table, exogeneity figure

fs_bs <- feols(vet_combined ~ order_num_serial_norm, data = est_df, cluster = ~ serial_num)
slope <- paste0("Slope: ", formatC(fs_bs$coefficients[2], digits = 2), " (", formatC(fs_bs$se[2], digits = 2), ")")

# Binscatter
p <- est_df |>
  ggplot(aes(x = order_num_serial_norm, y = vet_combined)) +
  stat_binscatter(bins = 15) +
  my_theme() +
  labs(
    x = "Order number (scaled)",
    y = "Veteran",
    caption = slope
  )

p_color <- p + geom_smooth(linewidth = .5, color = "#F8766D", method = "lm", se = FALSE)
ggsave(file.path(fig_dir, "color", "fs_binscatter.pdf"), plot = p_color, width = 5, height = 3)

p_bw <-  p + geom_smooth(linewidth = .5, color = "black", method = "lm", se = FALSE)
ggsave(file.path(fig_dir, "bw", "fs_binscatter.pdf"), plot = p_bw, width = 5, height = 3)

fs <- feols(
  vet_combined ~ order_num_serial_norm |
  csw(board_identifier, birthyr_cards + bpl_cards, exemption^married_cards, farmer + laborer + farm_laborer),
  data = est_df ,
  cluster ~ serial_num
)

etable(
  fs,
  tex = TRUE,
  fixef.group = list(
    "-_Birth year and state" = c("Birth year", "Birth state"),
    "-_Exemption claim $\\times$ married" = c("Exemption-Married"),
    "-_Prewar occupation" = c("Farmer", "Laborer", "Farm laborer")
  ),
  fitstat = ~ n + r2 + my,
  signif.code = c("***" = 0.01, "**" = 0.05, "*" = 0.10),
  file = file.path(tab_dir, "first_stage.tex"),
  replace = TRUE
)

# Exogeneity
outcomes <- c("vet_combined", "exemption", "married_cards", "farmer", "laborer", "farm_laborer")

exog <- feols(
  .[paste0(outcomes, "_z")] ~ order_num_serial_norm | birthyr_cards + bpl_cards + board_identifier,
  data = est_df |>
    mutate(
      across(all_of(outcomes), list(z = ~ scale(.)[,1]))
    ),
  cluster ~ serial_num
)

p <- map_dfr(exog, broom::tidy, conf.int = T, .id = "outcome") |>
  mutate(
    outcome = factor(
      outcome,
      levels = rev(paste0("lhs: ", outcomes, "_z")),
      labels = rev(c("Veteran", "Exemption", "Married", "Farmer", "Laborer", "Farm laborer"))
    )
  ) |>
  ggplot(aes(x = estimate, xmin = conf.low, xmax = conf.high, y = outcome)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = .7) +
  my_theme() +
  xlim(-.25, .25) +
  labs(
    x = "Coefficient on order number (scaled)",
    y = ""
  )

p_color <- p +
  geom_point(size = 2, color = "#F8766D") +
  geom_linerange(color = "#F8766D")
ggsave(file.path(fig_dir, "color", "fs_exogeneity.pdf"), plot = p_color, width = 5, height = 2.5)

p_bw <- p +
  geom_point(size = 2) +
  geom_linerange()
ggsave(file.path(fig_dir, "bw", "fs_exogeneity.pdf"), plot = p_bw, width = 5, height = 2.5)

#################################################################################
# OLS full sample vs. matched

ols_est_df <- census |>
  filter(birthyr_c1930 >= 1886, birthyr_c1930 <= 1896) |>
  mutate(
    is_naacp = replace_na(is_naacp, 0),
    ownhome = ownershp_c1930 == 10,
    employed = empstat_c1930 == 10,
    literate = lit_c1930 == 4,
    # set occscore to 0 if unemployed or out of the labor force
    occscore = ifelse(empstat_c1930 %in% c(20, 30), 0, occscore_c1930),
    across(c(is_naacp, ownhome, employed, literate, occscore), list(z = \(x) scale(x)[,1]))
  )

res_full <- feols(
  c(is_naacp_z, ownhome_z, employed_z, literate_z, occscore_z) ~ vet_c1930,
  data = ols_est_df,
  vcov = "hetero"
)

res_matched <- feols(
  c(is_naacp_z, ownhome_z, employed_z, literate_z, occscore_z) ~ vet_c1930,
  data = est_df |>
    mutate(
      is_naacp = replace_na(is_naacp, 0),
      ownhome = ownershp_c1930 == 10,
      employed = empstat_c1930 == 10,
      literate = lit_c1930 == 4,
      occscore = ifelse(empstat_c1930 %in% c(20, 30), 0, occscore_c1930),
      across(c(is_naacp, ownhome, employed, literate, occscore), list(z = \(x) scale(x)[,1]))
    ),
  vcov = "hetero"
)

p_color <- bind_rows(
  map_dfr(res_full, broom::tidy, conf.int = T, .id = "spec") |> mutate(sample = "Full"),
  map_dfr(res_matched, broom::tidy, conf.int = T, .id = "spec") |> mutate(sample = "Linked"),
) |>
  filter(term == "vet_c1930TRUE") |>
  mutate(
    spec = factor(
      spec,
      levels = paste0(
        "lhs: ", c("is_naacp", "literate", "ownhome", "employed", "occscore"), "_z"),
      labels = c("NAACP", "Literate", "Owns home", "Employed", "Occup. income")
    )
  ) |>
  ggplot(aes(y = fct_rev(spec), x = estimate, xmin = conf.low,
             xmax = conf.high, color = fct_rev(sample), shape = sample)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = .7) +
  geom_point(size = 2, position=position_dodge2(.5)) +
  geom_linerange(position=position_dodge2(.5)) +
  my_theme() +
  labs(
    x = "Coefficient on veteran",
    y = "",
    color = "Sample",
    shape = "Sample"
  ) +
  xlim(-.3, .3) +
  guides(color = guide_legend(reverse = TRUE))
ggsave(file.path(fig_dir, "color", "sample_comparison.pdf"), plot = p_color, width = 5, height = 3)

p_bw <- p_color + scale_color_manual(values = c("Full" = "black", "Linked" = "#555"))
ggsave(file.path(fig_dir, "bw", "sample_comparison.pdf"), plot = p_bw, width = 5, height = 3)

#################################################################################
# Main IV results table

ols <- feols(
  is_naacp ~ vet_combined |
    csw(board_identifier, birthyr_cards + bpl_cards, exemption^married_cards,
    farmer + laborer + farm_laborer, county_geoid_c1930),
  data = est_df,
  vcov = "hetero"
)

my_nonvet <- ols |>
  map_dbl(\(x) {
    rhs <- model.matrix(x)
    lhs <-  model.matrix(x, type = "lhs")
    mean(lhs[rhs == 0])
  }) |>
  as.numeric()

ols_coefs <- sprintf(
  "%.04f",
  ols |> map_dfr(broom::tidy) |> pull(estimate)
)

ols_t <- sprintf(
  "%.02f",
  ols |> map_dfr(broom::tidy) |> pull(statistic)
)

iv <- feols(
  is_naacp ~ 1 |
    csw(board_identifier, birthyr_cards + bpl_cards, exemption^married_cards,
        farmer + laborer + farm_laborer, county_geoid_c1930) |
    vet_combined ~ order_num_serial_norm,
  data = est_df,
  cluster = ~ serial_num
)

etable(
  iv,
  tex = TRUE,
  signif.code = c("***" = 0.01, "**" = 0.05, "*" = 0.10),
  fitstat = ~ n + r2 + ivwald,
  replace = TRUE,
  file = file.path(tab_dir, "iv_main.tex"),
  fixef.group =  list(
    "-_Birth year and state" = c("Birth year", "Birth state"),
    "-_Exemption claim $\\times$ married" = c("Exemption-Married"),
    "-_Prewar occupation" = c("Farmer", "Laborer", "Farm laborer"),
    "-_County in 1930" = c("county_geoid_c1930")
  ),
  extralines = list(
    "__Dep. var. mean (nonveterans)" = my_nonvet,
    "__OLS coefficient" = ols_coefs,
    "__OLS $t$-statistic" = ols_t
  ),
  postprocess.tex = function(x) { 
    x <- gsub("Wald (1st stage), Veteran", "First stage $F$-statistic", x, fixed = TRUE)
    return(x)
  }
)

#################################################################################
# Robustness to other definitions of veteran status

iv_vet0 <- feols(
  is_naacp ~ 1 | board_identifier + birthyr_cards + bpl_cards + exemption^married_cards +
    farmer + laborer + farm_laborer |
    vet_combined ~ order_num_serial_norm,
  data = est_df,
  cluster = ~ serial_num
)

iv_vet1 <- feols(
  is_naacp ~ 1 | board_identifier + birthyr_cards + bpl_cards + exemption^married_cards +
        farmer + laborer + farm_laborer |
    vet_c1930 ~ order_num_serial_norm,
  data = est_df,
  cluster = ~ serial_num
)

iv_vet2 <- feols(
  is_naacp ~ 1 | board_identifier + birthyr_cards + bpl_cards + exemption^married_cards +
    farmer + laborer + farm_laborer |
    vet_cards ~ order_num_serial_norm,
  data = est_df,
  cluster = ~ serial_num
)

iv_vet3 <- feols(
  is_naacp ~ 1 | board_identifier + birthyr_cards + bpl_cards + exemption^married_cards +
    farmer + laborer + farm_laborer |
    vet_intersect ~ order_num_serial_norm,
  data = est_df |>
    mutate(vet_intersect = vet_c1930 & vet_cards),
  cluster = ~ serial_num
)

ols <- feols(
  is_naacp ~ sw(vet_combined, vet_c1930, vet_cards, vet_intersect) |
    board_identifier + birthyr_cards + bpl_cards + exemption^married_cards +
    farmer + laborer + farm_laborer,
  data = est_df |>
    mutate(vet_intersect = vet_c1930 & vet_cards)
)

ols_coefs <- sprintf(
  "%.04f",
  ols |> map_dfr(broom::tidy) |> pull(estimate)
)

ols_t <- sprintf(
  "%.02f",
  ols |> map_dfr(broom::tidy) |> pull(statistic)
)

my_nonvet <- ols |>
  map_dbl(\(x) {
    rhs <- model.matrix(x)
    lhs <-  model.matrix(x, type = "lhs")
    mean(lhs[rhs == 0])
  }) |>
  as.numeric()

fstats <- sapply(
  list(iv_vet0, iv_vet1, iv_vet2, iv_vet3),
  function(x) {fitstat(x, "ivwald")[[1]]$stat}
)

mx <- est_df |>
  mutate(vet_intersect = vet_c1930 & vet_cards) |>
  summarize(
    across(c(vet_combined, vet_c1930, vet_cards, vet_intersect), mean)
  ) |>
  as.numeric()
mx <- sprintf("%.03f", mx)

etable(
  iv_vet0,
  iv_vet1,
  iv_vet2,
  iv_vet3,
  tex = TRUE,
  signif.code = c("***" = 0.01, "**" = 0.05, "*" = 0.10),
  fitstat = ~ n + r2,
  replace = TRUE,
  file = file.path(tab_dir, "iv_robustness_veteran.tex"),
  dict = c(
    "vet_combined" = "Veteran (Baseline)",
    "vet_c1930TRUE" = "Veteran (Census)",
    "vet_cardsTRUE" = "Veteran (VAMI/ATS)",
    "vet_intersectTRUE" = "Veteran (Intersection)"
  ),
  fixef.group =  list(
    "-_Birth year and state" = c("Birth year", "Birth state"),
    "-_Exemption claim $\\times$ married" = c("Exemption-Married"),
    "-_Prewar occupation" = c("Farmer", "Laborer", "Farm laborer")
  ),
  extralines = list(
    "__First stage $F$-statistic" = fstats,
    "__Dep. var. mean (nonveterans)" = my_nonvet,
    "__Mean veteran indicator" = mx,
    "__OLS coefficient" = ols_coefs,
    "__OLS $t$-statistic" = ols_t
  )
)

#################################################################################
# Other NAACP outcomes

years_coefs <- feols(
  c(in_15_17_z, in_18_19_z, in_20_24_z, in_25_29_z, in_30_34_z, in_35_40_z) ~ 1 |
    birthyr_cards + bpl_cards + board_identifier +
    exemption^married_cards + farmer + laborer + farm_laborer |
    vet_combined ~ order_num_serial_norm,
  data = est_df |>
    mutate(
      naacp_year_min = ifelse(naacp_year_min < 1915, 1915, naacp_year_min),
      naacp_year_max = ifelse(naacp_year_max > 1940, 1940, naacp_year_max),
      in_15_17 = naacp_year_min <= 1917,
      in_18_19 = (naacp_year_min <= 1919) & (naacp_year_max >= 1918),
      in_20_24 = (naacp_year_min <= 1924) & (naacp_year_max >= 1920),
      in_25_29 = (naacp_year_min <= 1929) & (naacp_year_max >= 1925),
      in_30_34 = (naacp_year_min <= 1934) & (naacp_year_max >= 1930),
      in_35_40 = (naacp_year_max >= 1935),
      across(c(in_15_17, in_18_19, in_20_24, in_25_29, in_30_34, in_35_40), list(z = \(x)scale(replace_na(as.integer(x), 0))[,1]))
    ),
  cluster = ~ serial_num
) |>
  map_dfr(broom::tidy, conf.int = TRUE, .id = "spec")
  
p <- years_coefs |>
  mutate(
    year = c(1916, 1918.5, 1922, 1927, 1932, 1937)
  ) |>
  ggplot(aes(x = year, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = .7) +
  my_theme() +
  labs(
    x = "NAACP member in",
    y = "TSLS coefficient on veteran"
  ) +
  scale_x_continuous(
    breaks = c(1916, 1918.5, 1922, 1927, 1932, 1937),
    labels = c("Before\n1917", "1918-\n1919", "1920-\n1924", "1925-\n1929", "1930-\n1934", "1935-\n1940")
  )

p_color <- p +
  geom_linerange(color = "#F8766D") +
  geom_point(size = 2, color = "#F8766D")
ggsave(file.path(fig_dir, "color", "naacp_year.pdf"), plot = p_color, width = 5, height = 3)

p_bw <- p +
  geom_linerange() +
  geom_point(size = 2)
ggsave(file.path(fig_dir, "bw", "naacp_year.pdf"), plot = p_bw, width = 5, height = 3)

res_n_years <- map_dfr(
  1:15,
  function(n_years) {
    feols(
      num_years_naacp_gt ~ 1 |
        birthyr_cards + bpl_cards + board_identifier +
        exemption^married_cards + farmer + laborer + farm_laborer |
        vet_combined ~ order_num_serial_norm,
      data = est_df |>
        mutate(
          num_years_naacp_gt = replace_na(
            (1 + naacp_year_max - naacp_year_min) >= n_years, 0)
        ),
      cluster = ~ serial_num
    ) |>
      broom::tidy(conf.int = TRUE) |>
      mutate(n_years = n_years)
  }
)

p <- res_n_years |>
  ggplot(aes(x = n_years, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_hline(yintercept = 0) +
  geom_ribbon(alpha = .05, color = "#777777", linetype = "dashed") +
  geom_line() +
  geom_point() +
  my_theme() +
  labs(
    x = "Number of years in NAACP",
    y = "TSLS coefficient on veteran"
  ) +
  scale_x_continuous(breaks = seq(1, 15, 1))

ggsave(file.path(fig_dir, "color", "naacp_years.pdf"), plot = p, width = 5, height = 3)
ggsave(file.path(fig_dir, "bw", "naacp_years.pdf"), plot = p, width = 5, height = 3)

res_n_years_unique <- map_dfr(
  1:10,
  function(n_years) {
    feols(
      num_years_naacp_gt ~ 1 |
        birthyr_cards + bpl_cards + board_identifier +
        exemption^married_cards + farmer + laborer + farm_laborer |
        vet_combined ~ order_num_serial_norm,
      data = est_df |>
        mutate(
          num_years_naacp_gt = replace_na(
            naacp_year_nunique >= n_years, 0)
        ),
      cluster = ~ serial_num
    ) |>
      broom::tidy(conf.int = TRUE) |>
      mutate(n_years = n_years)
  }
)

p <- res_n_years_unique |>
  ggplot(aes(x = n_years, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_hline(yintercept = 0) +
  geom_ribbon(alpha = .05, color = "#777777", linetype = "dashed") +
  geom_line() +
  geom_point() +
  my_theme() +
  labs(
    x = "Number of times in NAACP rosters",
    y = "TSLS coefficient on veteran"
  ) +
  scale_x_continuous(breaks = seq(1, 15, 1))

ggsave(file.path(fig_dir, "color", "naacp_years_nunique.pdf"), plot = p, width = 5, height = 3)
ggsave(file.path(fig_dir, "bw", "naacp_years_nunique.pdf"), plot = p, width = 5, height = 3)
